{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### 增加数据\n",
    "\n",
    "降低模型方差的最直接方法就是增加训练数据(但注意验证集和测试集合与之前保持一致)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "\n",
    "class DataBinWrapper(object):\n",
    "    def __init__(self, max_bins=10):\n",
    "        # 分段数\n",
    "        self.max_bins = max_bins\n",
    "        # 记录x各个特征的分段区间\n",
    "        self.XrangeMap = None\n",
    "\n",
    "    def fit(self, x):\n",
    "        n_sample, n_feature = x.shape\n",
    "        # 构建分段数据\n",
    "        self.XrangeMap = [[] for _ in range(0, n_feature)]\n",
    "        for index in range(0, n_feature):\n",
    "            tmp = sorted(x[:, index])\n",
    "            for percent in range(1, self.max_bins):\n",
    "                percent_value = np.percentile(tmp, (1.0 * percent / self.max_bins) * 100.0 // 1)\n",
    "                self.XrangeMap[index].append(percent_value)\n",
    "            self.XrangeMap[index] = sorted(list(set(self.XrangeMap[index])))\n",
    "\n",
    "    def transform(self, x):\n",
    "        \"\"\"\n",
    "        抽取x_bin_index\n",
    "        :param x:\n",
    "        :return:\n",
    "        \"\"\"\n",
    "        if x.ndim == 1:\n",
    "            return np.asarray([np.digitize(x[i], self.XrangeMap[i]) for i in range(0, x.size)])\n",
    "        else:\n",
    "            return np.asarray([np.digitize(x[:, i], self.XrangeMap[i]) for i in range(0, x.shape[1])]).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import lightgbm as lgb\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, auc\n",
    "from pre_process.pre_process_wy1_beforejt3000_joincx import *\n",
    "import toad\n",
    "from sklearn import metrics\n",
    "diretory=\"xxx\"\n",
    "test0 = pd.read_csv(diretory+'xxx1.csv', low_memory=False)\n",
    "#追加部分新数据\n",
    "other_df = pd.read_csv(diretory+'xxx2.csv', low_memory=False)\n",
    "other_df=other_df[other_df['effdate'].apply(lambda x:x[:7]<=\"2019-10\")]\n",
    "test0=pd.concat([test0,other_df])\n",
    "\n",
    "test0 = test0[(test0['dpt'] == 217) & (test0['renew_flag'] == 1)]\n",
    "\n",
    "claim = pd.read_csv(diretory+'/data/claimnum.csv')\n",
    "claim = claim[['polnum_com', 'num']]\n",
    "claim = claim.rename(columns={'polnum_com': 'policy_no'})\n",
    "test0=pd.merge(test0,claim,on=\"policy_no\",how=\"inner\")\n",
    "test0['y'] = np.where(test0['num']>0, 1, 0)\n",
    "test0=test0.drop(\"num\",axis=1)\n",
    "dat1=test0.copy()\n",
    "#因子筛选\n",
    "dat1 = ppcess_filterCols(dat1, cols='join')\n",
    "#去掉饱和度过低的因子 by 80%\n",
    "na_cols = []\n",
    "for c in dat1.columns:\n",
    "    if dat1[c].isnull().sum()/len(dat1)>0.8:\n",
    "        na_cols.append(c)\n",
    "dat1 = dat1.drop(na_cols, axis=1)\n",
    "jt_cx_cols = ['xxx']\n",
    "dat1 = dat1.drop(jt_cx_cols, axis=1)\n",
    "dat1 = ppcess_dateCols(dat1,print_process_cols=False) # 日期因子格式\n",
    "dat1 = ppcess_toNum(dat1,print_process_cols=False) # 部分因子转换为数值型\n",
    "dat1 = ppcess_delCols_na(dat1,print_process_cols=False) # 删去饱和度低于1%的因子\n",
    "dat1 = ppcess_inpute(dat1,print_process_cols=False) # 部分因子按规则进行空值填充，数值型因子在后面做均值填充\n",
    "dat1 = ppcess_delCols_uniValue(dat1,print_process_cols=False) # 删去全部唯一值的因子\n",
    "\n",
    "dat2 = dat1.copy()\n",
    "dat2 = feature_catValues(dat2,print_process_cols=False) # 修正部分因子取值\n",
    "dat2 = feature_factorLabel1(dat2,print_process_cols=False) # 离散特征编码1\n",
    "dat2 = feature_factorLabel2(dat2,print_process_cols=False) # 离散特征编码2\n",
    "dat2 = feature_lambda(dat2,print_process_cols=False) # 构建一些特征\n",
    "dat2 = dat2.drop(['xxx'], axis=1)\n",
    "dat2['xxx'] = dat2['xxx'].fillna(0)\n",
    "dat2['y']=test0['y']\n",
    "#切分训练、验证、测试\n",
    "trn_df=dat2[test0['effdate'].apply(lambda x:x[:7]<=\"2019-10\")]\n",
    "\n",
    "dev_test_df=dat2[test0['effdate'].apply(lambda x:x[:7]>=\"2019-11\")]\n",
    "indice=list(range(0,dev_test_df.shape[0]))\n",
    "np.random.shuffle(indice)\n",
    "dev_test_df=dev_test_df.iloc[indice]\n",
    "dev_df=dev_test_df.iloc[:dev_test_df.shape[0]//2]\n",
    "test_df=dev_test_df.iloc[dev_test_df.shape[0]//2:]\n",
    "\n",
    "#target encoding\n",
    "object_cols=trn_df.dtypes[trn_df.dtypes==object].reset_index()['index'].tolist()\n",
    "trn_df[object_cols]=trn_df[object_cols].fillna(\"missing\")\n",
    "dev_df[object_cols]=dev_df[object_cols].fillna(\"missing\")\n",
    "test_df[object_cols]=test_df[object_cols].fillna(\"missing\")\n",
    "object_target_cols=[]\n",
    "for col in object_cols:\n",
    "    object_target_cols.append(col+\"_target\")\n",
    "    trn_df[col+\"_target\"]=trn_df[col]\n",
    "    dev_df[col+\"_target\"]=dev_df[col]\n",
    "    test_df[col+\"_target\"]=test_df[col]\n",
    "import category_encoders as ce\n",
    "le=ce.TargetEncoder(cols=object_target_cols)\n",
    "le.fit(trn_df,trn_df['y'])\n",
    "trn_df=le.transform(trn_df)\n",
    "dev_df=le.transform(dev_df)\n",
    "test_df=le.transform(test_df)\n",
    "#ordinary encoding\n",
    "oe=ce.OrdinalEncoder()\n",
    "oe.fit(trn_df,cols=object_cols)\n",
    "trn_df=oe.transform(trn_df)\n",
    "dev_df=oe.transform(dev_df)\n",
    "test_df=oe.transform(test_df)\n",
    "#分箱做一次WOE\n",
    "trn_woe_df=trn_df.drop([\"policy_no\",\"y\"],axis=1).copy()\n",
    "dev_woe_df=dev_df.drop([\"policy_no\",\"y\"],axis=1).copy()\n",
    "test_woe_df=test_df.drop([\"policy_no\",\"y\"],axis=1).copy()\n",
    "woe_cols=[item+\"_woe\" for item in trn_woe_df.columns]\n",
    "dbw=DataBinWrapper()\n",
    "dbw.fit(trn_woe_df.values)\n",
    "trn_woe_df=pd.DataFrame(data=dbw.transform(trn_woe_df.values),columns=woe_cols)\n",
    "dev_woe_df=pd.DataFrame(data=dbw.transform(dev_woe_df.values),columns=woe_cols)\n",
    "test_woe_df=pd.DataFrame(data=dbw.transform(test_woe_df.values),columns=woe_cols)\n",
    "trn_woe_df=trn_woe_df.astype(\"object\")\n",
    "dev_woe_df=dev_woe_df.astype(\"object\")\n",
    "test_woe_df=test_woe_df.astype(\"object\")\n",
    "woe_encoder=ce.WOEEncoder()\n",
    "woe_encoder.fit(trn_woe_df,trn_df['y'],cols=woe_cols)\n",
    "trn_woe_df=woe_encoder.transform(trn_woe_df)\n",
    "dev_woe_df=woe_encoder.transform(dev_woe_df)\n",
    "test_woe_df=woe_encoder.transform(test_woe_df)\n",
    "trn_df=pd.concat([trn_df.reset_index(),trn_woe_df.reset_index()],axis=1).drop([\"index\"],axis=1)\n",
    "dev_df=pd.concat([dev_df.reset_index(),dev_woe_df.reset_index()],axis=1).drop([\"index\"],axis=1)\n",
    "test_df=pd.concat([test_df.reset_index(),test_woe_df.reset_index()],axis=1).drop([\"index\"],axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "训练模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#自定义metrics\n",
    "def eval_function(y_true,y_pred):\n",
    "    try:\n",
    "        y_pred=y_pred.get_label()\n",
    "    except:\n",
    "        pass\n",
    "    sort_indice=np.argsort(y_pred)[::-1]\n",
    "    metric_value=y_true[sort_indice[:int(0.05*len(y_true))]].mean()\n",
    "    return \"eval_function\",metric_value,True\n",
    "def eval(y_true,y_pred):\n",
    "    return np.round(eval_function(y_true,y_pred)[1]/eval_function(y_true,y_true)[1],2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 调参推荐：https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html\n",
    "# 超参\n",
    "params = {\n",
    "'boosting_type':'gbdt',#集成方式，还包括rf,dart,goss等\n",
    "'objective':'binary',\n",
    "'metric':\"eval_function\",\n",
    "'learning_rate':0.01,#学习率\n",
    "'max_depth':9,#单颗树的最大深度\n",
    "'num_leaves':500,#叶子节点数\n",
    "'max_bins':255,#分箱数s\n",
    "'lambda_l1':1e-4,#l1正则化权重\n",
    "'lambda_l2':1e-4,#l2正则化权重\n",
    "'min_data_in_leaf':5,\n",
    "'bagging_freq':5,\n",
    "'bagging_fraction':0.5\n",
    "}\n",
    "\n",
    "trn_x = trn_df.drop(['policy_no','y'], axis=1)\n",
    "trn_y = trn_df['y']\n",
    "\n",
    "val_x = dev_df.drop(['policy_no','y'], axis=1)\n",
    "val_y = dev_df['y']\n",
    "\n",
    "trn_data = lgb.Dataset(trn_x, trn_y,categorical_feature=object_cols)\n",
    "val_data = lgb.Dataset(val_x, val_y,categorical_feature=object_cols)\n",
    "\n",
    "reg = lgb.train(params = params,\n",
    "                train_set = trn_data,\n",
    "                num_boost_round = 500,#最大树数量\n",
    "                early_stopping_rounds = 100,#如何验证集效果在20轮中没有明显变好，就终止\n",
    "                feval=eval_function,\n",
    "                valid_sets = [val_data])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练集预测结果 0.72\n",
      "验证集预测结果 0.5\n",
      "测试集预测结果 0.5\n"
     ]
    }
   ],
   "source": [
    "test_x = test_df.drop(['policy_no','y'], axis=1)\n",
    "test_y = test_df['y']\n",
    "print(\"训练集预测结果\",eval(trn_y.values,reg.predict(trn_x)))\n",
    "print(\"验证集预测结果\",eval(val_y.values,reg.predict(val_x)))\n",
    "print(\"测试集预测结果\",eval(test_y.values,reg.predict(test_x)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这时,    \n",
    "偏差=1.0-0.72=0.28  \n",
    "方差=0.72-0.5=0.22  \n",
    "方差下降了大概15%，偏差没变，这里需要注意一个问题，并不是无脑添加数据，模型效果就能得到提升，模型效果与训练样本容量通常有如下规律     \n",
    "![avatar](./pic/2.jpg)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
